notebook.community

Edit and run

All pandas objects are value mutable but may not be size mutable. Lenght of a Series cannot be changed but columns can be inserted into a DataFrame.



In [1]:

    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



In [5]:

    
# Creating a Series Object
s =  pd.Series([1,2,3,np.nan, 'a', 4+5j])
s









    Out[5]:





0         1
1         2
2         3
3       NaN
4         a
5    (4+5j)
dtype: object



In [8]:

    
# To generate dates
dates = pd.date_range('20130125', periods = 8)
dates









    Out[8]:





DatetimeIndex(['2013-01-25', '2013-01-26', '2013-01-27', '2013-01-28',
               '2013-01-29', '2013-01-30', '2013-01-31', '2013-02-01'],
              dtype='datetime64[ns]', freq='D')



In [12]:

    
# Creating a DataFrame
dates = pd.date_range('20180101', periods = 6)
df = pd.DataFrame(np.random.rand(6,4), index = dates, columns = list('ABCD'))
df



In [15]:

    
# Creating a DataFrame from a dictionary
temp_dict = {'A':1,
            'B':pd.Series([2,3,4,5]),
            'C':np.array([3]*4, dtype='int32')}
df2 = pd.DataFrame(temp_dict)
df2



In [16]:

    
# To get the dtype for every column
df2.dtypes









    Out[16]:





A    int64
B    int64
C    int32
dtype: object



In [20]:

    
# To get the first x entries use pd.head(x) default = 5
df.head()



In [21]:

    
# To view last x entries
df.tail(3)



In [22]:

    
# To get the index of the dataframe
df.index









    Out[22]:





DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06'],
              dtype='datetime64[ns]', freq='D')



In [23]:

    
# To get index of columns
df.columns









    Out[23]:





Index(['A', 'B', 'C', 'D'], dtype='object')



In [24]:

    
# To get a quick statistic summary of a dataFrame
df.describe()



In [26]:

    
# To transpose your DataFrame
df.T









    Out[26]:







  
    
      
      2018-01-01 00:00:00
      2018-01-02 00:00:00
      2018-01-03 00:00:00
      2018-01-04 00:00:00
      2018-01-05 00:00:00
      2018-01-06 00:00:00
    
  
  
    
      A
      0.690510
      0.928369
      0.032481
      0.605776
      0.962642
      0.016343
    
    
      B
      0.290334
      0.994264
      0.079715
      0.584509
      0.736450
      0.898750
    
    
      C
      0.498668
      0.808196
      0.842365
      0.324846
      0.469180
      0.483067
    
    
      D
      0.724776
      0.509202
      0.575144
      0.378899
      0.705774
      0.883672



In [29]:

    
# To sort by an axis

# axis 0 mean along the columns i.e. the dates got sorted in descending order
df.sort_index(axis=0, ascending=False)



In [31]:

    
# To arrange along the columns
df.sort_index(axis=1, ascending=False)



In [34]:

    
# To sort by specific values
df.sort_values(by='C', ascending=False)



In [35]:

    
# For selecting elements



In [36]:

    
# To select columns
df['A']









    Out[36]:





2018-01-01    0.690510
2018-01-02    0.928369
2018-01-03    0.032481
2018-01-04    0.605776
2018-01-05    0.962642
2018-01-06    0.016343
Freq: D, Name: A, dtype: float64



In [37]:

    
df[0:3]



In [39]:

    
df[0:4].A









    Out[39]:





2018-01-01    0.690510
2018-01-02    0.928369
2018-01-03    0.032481
2018-01-04    0.605776
Freq: D, Name: A, dtype: float64



In [46]:

    
# To select by labels
dates









    Out[46]:





DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06'],
              dtype='datetime64[ns]', freq='D')



In [51]:

    
# To select a row
df.loc['2018-01-01']









    Out[51]:





A    0.690510
B    0.290334
C    0.498668
D    0.724776
Name: 2018-01-01 00:00:00, dtype: float64



In [53]:

    
# To select from multiple columns
df.loc[:, ['A', 'B']]



In [56]:

    
# To get the cell values
df.loc['2018-01-01', 'B']









    Out[56]:





0.29033430143768058



In [63]:

    
# Use this method for fast access than the previous one
df.at[dates[0], 'B']   # Where dates[0] = row name









    Out[63]:





0.29033430143768058



In [67]:

    
dates[0]









    Out[67]:





Timestamp('2018-01-01 00:00:00', freq='D')



In [68]:

    
df



In [66]:

    
# Now to operate on DataFrame using indexing without the index names
df.iloc[3]









    Out[66]:





A    0.605776
B    0.584509
C    0.324846
D    0.378899
Name: 2018-01-04 00:00:00, dtype: float64



In [69]:

    
# Here the first argument returns the number of rows and
# the second argument returns the number of columns
df.iloc[1:3 , 0:3]



In [76]:

    
# To select specific columns
df.iloc[[1,2,4], :]



In [71]:

    
df.iloc[0,1]









    Out[71]:





0.29033430143768058



In [72]:

    
# To get fast access use
df.iat[0,1]









    Out[72]:





0.29033430143768058



In [73]:

    
df.iat[0,1] = 1



In [74]:

    
df.at[dates[0], 'B'] = 2



In [75]:

    
df.iat[0,1]









    Out[75]:





2.0



In [79]:

    
# to make DataFrames by conditioning
df[df.A > 0.5]



In [83]:

    
df[df > 0.5]



In [95]:

    
# To add new columns to DataFrame
df['E'] = ['one', 'one','two','three','four','three']
df



In [87]:

    
df['F'] = pd.Series([1,2,3,4,5,6])
df



In [88]:

    
# To select specific rows with desired values
df[df['E'].isin(['two', 'three'])]



In [96]:

    
# Missing data
# By default represented by np.nan
df = df[df > 0.5]
df



In [97]:

    
del df['E']
df



In [99]:

    
# To change index of DataFrame
df.reindex(index=['one', 'two', 'three', 'four', 'five', 'six'], 
           columns=list('ZYXWV'))









    Out[99]:







  
    
      
      Z
      Y
      X
      W
      V
    
  
  
    
      one
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      two
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      three
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      four
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      five
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      six
      NaN
      NaN
      NaN
      NaN
      NaN



In [101]:

    
# To drop any row having missing values
df.dropna(how='any')
# If the row contains a single Nan it would be dropped



In [102]:

    
# All the entries in a row must by nan to get dropped
df.dropna(how='all')



In [104]:

    
# To replace Nan with desired values
df.fillna(value='Value Filled')









    Out[104]:







  
    
      
      A
      B
      C
      D
      F
    
  
  
    
      2018-01-01
      0.69051
      2
      Value Filled
      0.724776
      Value Filled
    
    
      2018-01-02
      0.928369
      0.994264
      0.808196
      0.509202
      Value Filled
    
    
      2018-01-03
      Value Filled
      Value Filled
      0.842365
      0.575144
      Value Filled
    
    
      2018-01-04
      0.605776
      0.584509
      Value Filled
      Value Filled
      Value Filled
    
    
      2018-01-05
      0.962642
      0.73645
      Value Filled
      0.705774
      Value Filled
    
    
      2018-01-06
      Value Filled
      0.89875
      Value Filled
      0.883672
      Value Filled



In [106]:

    
# TO get boolean mask of the DataFrame
df.isna()
# False if not Nan and True if Nan









    Out[106]:







  
    
      
      A
      B
      C
      D
      F
    
  
  
    
      2018-01-01
      False
      False
      True
      False
      True
    
    
      2018-01-02
      False
      False
      False
      False
      True
    
    
      2018-01-03
      True
      True
      False
      False
      True
    
    
      2018-01-04
      False
      False
      True
      True
      True
    
    
      2018-01-05
      False
      False
      True
      False
      True
    
    
      2018-01-06
      True
      False
      True
      False
      True



In [107]:

    
# Various operations on a DataFrame



In [114]:

    
df = pd.DataFrame(10*np.random.rand(6,4),
                 index = pd.date_range('20180101', periods=6),
                 columns=list('ABCD'))
df



In [117]:

    
df.mean() # along axis = 0









    Out[117]:





A    4.705890
B    5.669860
C    5.551811
D    5.100748
dtype: float64



In [118]:

    
# To apply some random function
df.apply(lambda x: x.max()-x.min())









    Out[118]:





A    6.476684
B    8.639543
C    7.207357
D    8.545320
dtype: float64



In [120]:

    
# To get value counts
df['A'].value_counts()









    Out[120]:





8.917154    1
4.087635    1
2.895181    1
2.440471    1
3.427748    1
6.467148    1
Name: A, dtype: int64



In [123]:

    
# To merge DataFrames use merge
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})

print(left)
print(right)

pd.merge(left, right)









    



   key  lval
0  foo     1
1  foo     2
   key  rval
0  foo     4
1  foo     5






    Out[123]:







  
    
      
      key
      lval
      rval
    
  
  
    
      0
      foo
      1
      4
    
    
      1
      foo
      1
      5
    
    
      2
      foo
      2
      4
    
    
      3
      foo
      2
      5

	A	B	C	D
count	6.000000	6.000000	6.000000	6.000000
mean	0.539354	0.597337	0.571054	0.629578
std	0.421480	0.354993	0.206797	0.178732
min	0.016343	0.079715	0.324846	0.378899
25%	0.175805	0.363878	0.472652	0.525688
50%	0.648143	0.660480	0.490868	0.640459
75%	0.868904	0.858175	0.730814	0.720026
max	0.962642	0.994264	0.842365	0.883672

	A	B	C	D
2018-01-01	8.917154	3.445613	5.064817	4.348126
2018-01-02	2.895181	8.083756	8.989853	8.996414
2018-01-03	2.440471	4.622014	7.497869	5.880346
2018-01-04	4.087635	7.287993	6.382507	6.802648
2018-01-05	3.427748	0.970122	1.782497	0.451094
2018-01-06	6.467148	9.609664	3.593324	4.125862

	A	B	C	D
2018-01-01	0.690510	0.290334	0.498668	0.724776
2018-01-02	0.928369	0.994264	0.808196	0.509202
2018-01-03	0.032481	0.079715	0.842365	0.575144
2018-01-04	0.605776	0.584509	0.324846	0.378899
2018-01-05	0.962642	0.736450	0.469180	0.705774
2018-01-06	0.016343	0.898750	0.483067	0.883672

	A	B	C	D
2018-01-01	0.690510	2.000000	0.498668	0.724776
2018-01-02	0.928369	0.994264	0.808196	0.509202
2018-01-04	0.605776	0.584509	0.324846	0.378899
2018-01-05	0.962642	0.736450	0.469180	0.705774

	Z	Y	X	W	V
one	NaN	NaN	NaN	NaN	NaN
two	NaN	NaN	NaN	NaN	NaN
three	NaN	NaN	NaN	NaN	NaN
four	NaN	NaN	NaN	NaN	NaN
five	NaN	NaN	NaN	NaN	NaN
six	NaN	NaN	NaN	NaN	NaN

	A	B	C	D	F
2018-01-01	0.69051	2	Value Filled	0.724776	Value Filled
2018-01-02	0.928369	0.994264	0.808196	0.509202	Value Filled
2018-01-03	Value Filled	Value Filled	0.842365	0.575144	Value Filled
2018-01-04	0.605776	0.584509	Value Filled	Value Filled	Value Filled
2018-01-05	0.962642	0.73645	Value Filled	0.705774	Value Filled
2018-01-06	Value Filled	0.89875	Value Filled	0.883672	Value Filled

	A	B	C	D	F
2018-01-01	False	False	True	False	True
2018-01-02	False	False	False	False	True
2018-01-03	True	True	False	False	True
2018-01-04	False	False	True	True	True
2018-01-05	False	False	True	False	True
2018-01-06	True	False	True	False	True